import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urldefrag, urlparse

# Starting URL: the quick-start page of the myquant Python SDK docs
base_url = 'https://www.myquant.cn/docs2/sdk/python/%E5%BF%AB%E9%80%9F%E5%BC%80%E5%A7%8B.html'
# Only follow links on this host so the crawl cannot wander off-site
allowed_domain = urlparse(base_url).netloc

# Track URLs that have already been crawled, to avoid fetching a page twice
visited_urls = set()

def fetch_page(url):
    """Download a page and return its raw bytes, or None on failure."""
    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        print(f"Request failed: {exc}")
        return None
    if response.status_code == 200:
        return response.content
    print(f"Request failed with status code: {response.status_code}")
    return None
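
# Optional courtesy, an assumption rather than part of the original script:
# pausing briefly before each request keeps the crawl from hammering the
# docs server. polite_get and REQUEST_DELAY are hypothetical names; swap
# polite_get in for requests.get inside fetch_page if rate limiting is wanted.
import time

REQUEST_DELAY = 0.5  # seconds to wait before each request

def polite_get(url):
    time.sleep(REQUEST_DELAY)
    return requests.get(url, timeout=10)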

def parse_page(content):
    """Parse raw HTML bytes into a BeautifulSoup tree."""
    soup = BeautifulSoup(content, 'html.parser')
    return soup

def extract_links(soup, base_url):
    """Resolve every <a href> on the page to an absolute URL."""
    links = []
    for a_tag in soup.find_all('a', href=True):
        full_url = urljoin(base_url, a_tag['href'])
        # Drop #fragments so the same page is not queued under many names,
        # and skip non-HTTP schemes such as mailto: or javascript:
        full_url, _ = urldefrag(full_url)
        if full_url.startswith(('http://', 'https://')):
            links.append(full_url)
    return links

def save_content(url, content):
    """Write the page text to content_<n>.txt, where n is the visit count."""
    with open(f"content_{len(visited_urls)}.txt", 'w', encoding='utf-8') as file:
        file.write(f"URL: {url}\n")
        file.write(content)

def crawl(url):
    # Skip pages already seen and anything outside the documentation site
    if url in visited_urls or urlparse(url).netloc != allowed_domain:
        return
    visited_urls.add(url)

    print(f"Crawling: {url}")
    content = fetch_page(url)
    if content:
        soup = parse_page(content)
        text_content = soup.get_text()
        save_content(url, text_content)

        # Depth-first: follow every link found on this page
        links = extract_links(soup, url)
        for link in links:
            crawl(link)
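
# The recursion above can hit Python's default recursion limit (roughly
# 1000 frames) on a deeply linked site. A minimal iterative sketch of the
# same traversal, using an explicit queue; crawl_iterative is a hypothetical
# name, not part of the original script:
from collections import deque

def crawl_iterative(start_url):
    queue = deque([start_url])
    while queue:
        url = queue.popleft()
        if url in visited_urls or urlparse(url).netloc != allowed_domain:
            continue
        visited_urls.add(url)
        print(f"Crawling: {url}")
        content = fetch_page(url)
        if content:
            soup = parse_page(content)
            save_content(url, soup.get_text())
            queue.extend(extract_links(soup, url))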

# Start crawling from the entry page
crawl(base_url)
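
# When the crawl finishes, the text of each fetched page is stored as a
# content_<n>.txt file in the current working directory.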